# Convert the tweet data frame to a tm corpus.
# DataframeSource() requires a data frame with `doc_id` and `text` columns:
# https://www.rdocumentation.org/packages/tm/versions/0.7-8/topics/DataframeSource
# Derive doc_id from the data rather than hard-coding 4619 rows, so the
# pipeline does not silently break when the tweet pull changes size.
doc_id <- seq_len(nrow(Tweets_state))
text_df <- data.frame(doc_id, text = Tweets_state$text, stringsAsFactors = FALSE)
# Convert the data frame to a corpus
tweets_corpus <- VCorpus(DataframeSource(text_df))
# Clean the text. Strip URLs and @mentions *before* removing numbers,
# stopwords and punctuation: otherwise a URL such as "http://t.co/abc123"
# has its digits/punctuation stripped first and leaves residue tokens.
corpus <- tm_map(tweets_corpus, content_transformer(function(x) gsub("http\\S+", "", x)))  # remove website addresses
corpus <- tm_map(corpus, content_transformer(function(x) gsub("@[A-Za-z0-9]+", "", x)))    # remove mentions in text
corpus <- tm_map(corpus, content_transformer(function(x) gsub("[[:cntrl:]]", "", x)))      # remove control characters
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)  # collapse whitespace last, after removals
# Document-term matrix of tweets: one row per tweet, one column per term
tweets_dtm <- DocumentTermMatrix(corpus)
# Convert the sparse DTM to a tidy (document, term, count) data frame and
# attach the original tweet metadata by matching document number to tweet id.
tweets_tidy <- tidy(tweets_dtm)
tweets_tidy <- mutate(tweets_tidy, index = as.numeric(document))
tweets_tidy <- left_join(tweets_tidy, Tweets_state, by = c("index" = "id"))
# Get the Bing lexicon (word -> positive/negative)
bing <- get_sentiments("bing")
# Score each tweet: join terms to the lexicon, count positive/negative
# hits per document, and compute a net sentiment score.
Tweets_bing <- inner_join(tweets_tidy, bing, by = c("term" = "word")) %>%
  # Count lexicon hits by sentiment, index, document, text
  count(sentiment, index, document, text) %>%
  # One column per sentiment; a tweet with no hits of a polarity gets 0.
  # pivot_wider() replaces the retired spread().
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  # Net sentiment: positive hits minus negative hits
  mutate(sentiment = positive - negative)
# Join per-tweet sentiment scores back onto the original twitter data frame
# NOTE(review): naming an element inside a negative select()
# (`size = favorite_count`) does not rename anything -- favorite_count is
# simply dropped along with text.y and document. If the intent was to keep
# favorite_count renamed to `size`, use rename(size = favorite_count)
# instead of putting it inside -c(). TODO confirm intent with author.
Tweets_sentiment <- Tweets_state %>%
inner_join(Tweets_bing, by = c("id" = "index")) %>%
select(-c(text.y,document, size=favorite_count))
# Jittered scatter of positive vs negative word counts per tweet
Pos_Neg <- ggplot(Tweets_sentiment, aes(x = positive, y = negative)) +
  geom_jitter(alpha = 0.8, color = "rosybrown3") +
  theme_minimal()
# Save the plot with ggsave(): it opens and closes the graphics device
# itself, so a failure while rendering cannot leave a dangling device
# (the manual png()/print()/dev.off() sequence has that risk).
ggsave("Pos_Neg.png", plot = Pos_Neg)
## quartz_off_screen
## 2
# Let's make a choropleth map based on sentiment score by state.
# Tweet locations arrive as "state:region" (e.g. "new york:long island");
# strip the ":region" suffix so every region collapses into its state.
# This replaces a 10-deep nested ifelse() chain that enumerated suffixes
# one by one (and contained a duplicated "new york:main" branch), and it
# also covers any region suffix not on the original hard-coded list.
tw_state <- Tweets_sentiment %>%
  mutate(state = sub(":.*$", "", state)) %>%
  filter(!is.na(state)) %>%
  group_by(state) %>%
  summarise(
    Avg.Sentiment = sum(sentiment) / n(),  # mean sentiment score per state
    WordCounts = sum(display_text_width),  # total word count per state
    # lon/lat vary within a state; keep one representative pair per state
    lon = first(lon),
    lat = first(lat),
    .groups = "drop"
  ) %>%
  # The first observed coordinate pair for New York is not representative;
  # pin the state to fixed NYC coordinates instead.
  mutate(
    lon = ifelse(state == "new york", -73.935242, lon),
    lat = ifelse(state == "new york", 40.730610, lat)
  )
## `summarise()` has grouped output by 'state', 'lon', 'lat'. You can override using the `.groups` argument.
# Polygon geometries for US states from the Census shapefile
#install.packages("tigris")
library(tigris)
# tigris downloads the shapefile on each run (with a progress bar); set
# options(tigris_use_cache = TRUE) in a script or .Rprofile to cache it.
# cb = TRUE fetches the lower-resolution cartographic boundary file, which
# is plenty for a choropleth. Use TRUE, never the reassignable alias T.
states <- states(cb = TRUE) %>%
  mutate(name = tolower(NAME))  # lowercase to match tw_state$state for the join
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================= | 46%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 62%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 89%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|======================================================================| 100%
# Attach the per-state sentiment summary to the state polygons.
# tigris::geo_join() is deprecated (it triggered the group_by_() warning in
# the knitted output); sf objects support dplyr joins directly, and
# left_join keeps every polygon so states without tweets still draw.
states_shape_tweets <- left_join(states, tw_state, by = c("name" = "state"))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Color palette spanning the observed range of average sentiment scores
pal <- colorNumeric("YlOrRd", domain = states_shape_tweets$Avg.Sentiment)
# HTML popup text shown when a state polygon is clicked
popup_sb <- with(states_shape_tweets, paste0(
  "State: ", as.character(NAME), "<br/>",
  "Average Sentiment Score: ", as.character(Avg.Sentiment), "<br/>",
  "Total Word Count: ", as.character(WordCounts), "<br/>"
))
# Render the choropleth: CartoDB.Positron basemap, state polygons filled by
# average sentiment score, click popups, hover highlight, and a legend.
# NOTE(review): mixing formula notation (~popup_sb) with explicit
# states_shape_tweets$Avg.Sentiment inside addPolygons() works but is
# inconsistent; ~Avg.Sentiment would resolve against the `data` argument.
# NOTE(review): the knitted warnings below report a NAD83 datum (leaflet
# expects WGS84) and 7 rows with missing lat/lon -- presumably states or
# territories with no matching tweets after the join; consider
# sf::st_transform(states_shape_tweets, 4326) before mapping. TODO confirm.
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
setView(-98.483330, 38.712046, zoom = 4) %>%
addPolygons(data = states_shape_tweets ,
fillColor = ~pal(states_shape_tweets$Avg.Sentiment),
fillOpacity = 0.7,
weight = 1.5,
opacity = 1,
color = "cornsilk",
dashArray = "3",
smoothFactor = 0.2,
popup = ~popup_sb,
highlight = highlightOptions(weight = 4,
color = "#839EA8",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
popupOptions = popupOptions(style = list("font-weight" = "normal",
padding = "3px 8px",
"box-shadow" = "3px 3px rgba(0,0,0,0.25)"),
textsize = "15px",
direction = "auto")) %>%
addLegend(pal = pal,
values = states_shape_tweets$Avg.Sentiment,
position = "bottomleft",
title = "Average Sentiment Score",
opacity = 0.7)
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'
## Warning in validateCoords(lng, lat, funcName): Data contains 7 rows with either
## missing or invalid lat/lon values and will be ignored